{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "plt.rcParams['font.sans-serif']=['SimHei']\n",
    "plt.rcParams['axes.unicode_minus'] = False\n",
    "import seaborn as sns; sns.set()\n",
    "\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>category</th>\n",
       "      <th>content</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "      <td>合晟资产是一家专注于股票、债券等二级市场投资，为合格投资者提供专业资产管理服务的企业。公司业...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>公司的主营业务为向中小微企业、个体工商户、农户等客户提供贷款服务，自设立以来主营业务未发生过变化。</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>公司立足于商业地产服务，致力于为商业地产开发、销售、运营全产业链提供一整套增值服务，业务覆盖...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2</td>\n",
       "      <td>公司经工商管理部门核准的经营范围为“投资咨询、经济信息咨询，企业管理咨询，品牌推广策划，公共...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2</td>\n",
       "      <td>该公司的主营业务为在中国境内(港、澳、台除外)开展保险代理销售，依托于自身的产品研究能力和专...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   category                                            content\n",
       "0         2  合晟资产是一家专注于股票、债券等二级市场投资，为合格投资者提供专业资产管理服务的企业。公司业...\n",
       "1         2  公司的主营业务为向中小微企业、个体工商户、农户等客户提供贷款服务，自设立以来主营业务未发生过变化。\n",
       "2         1  公司立足于商业地产服务，致力于为商业地产开发、销售、运营全产业链提供一整套增值服务，业务覆盖...\n",
       "3         2  公司经工商管理部门核准的经营范围为“投资咨询、经济信息咨询，企业管理咨询，品牌推广策划，公共...\n",
       "4         2  该公司的主营业务为在中国境内(港、澳、台除外)开展保险代理销售，依托于自身的产品研究能力和专..."
      ]
     },
     "execution_count": 73,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Read the two-column CSV; it has no header row, so the column names are supplied here.\n",
    "data = pd.read_csv(\n",
    "    \"./data/clustering_training.csv\",\n",
    "    header=None,\n",
    "    names=['category', 'content'],\n",
    "    encoding='utf-8',\n",
    ")\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1. 分词（20分）：由于企业描述是文本信息，需要对文本信息进行特征提取。文本分词可采用Jieba分词： "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(4774, 2)"
      ]
     },
     "execution_count": 74,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# (rows, columns) of the loaded table.\n",
    "data.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 第一部分：分词"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pull the description column out as a plain Python list; the entries are segmented one at a time with jieba afterwards.\n",
    "contents=data.content.values.tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [],
   "source": [
    "import jieba\n",
    "\n",
    "# Segment every description into a list of tokens with jieba.\n",
    "# Descriptions that are blank, or that yield at most one token, are skipped.\n",
    "# NOTE(review): a skipped row shifts the positions relative to `data`, so the\n",
    "# category labels would no longer line up with `content_s` if anything is dropped.\n",
    "content_s=[]\n",
    "for line in contents:\n",
    "    current_segment=jieba.lcut(line)\n",
    "    # `current_segment` is a list and can never equal the string \"\\n\", so test the raw text for blankness instead.\n",
    "    if len(current_segment)>1 and line.strip():\n",
    "        content_s.append(current_segment)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(4774,)"
      ]
     },
     "execution_count": 77,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# The token lists have different lengths, so build an explicit object array:\n",
    "# NumPy >= 1.24 raises ValueError for ragged nested sequences without dtype=object.\n",
    "np.array(content_s, dtype=object).shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>content_s</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>[合晟, 资产, 是, 一家, 专注, 于, 股票, 、, 债券, 等, 二级, 市场, 投...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>[公司, 的, 主营业务, 为, 向, 中小, 微, 企业, 、, 个体, 工商户, 、, ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>[公司, 立足于, 商业地产, 服务, ，, 致力于, 为, 商业地产, 开发, 、, 销售...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>[公司, 经, 工商管理, 部门, 核准, 的, 经营范围, 为, “, 投资, 咨询, 、...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>[该, 公司, 的, 主营业务, 为, 在, 中国, 境内, (, 港, 、, 澳, 、, ...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                           content_s\n",
       "0  [合晟, 资产, 是, 一家, 专注, 于, 股票, 、, 债券, 等, 二级, 市场, 投...\n",
       "1  [公司, 的, 主营业务, 为, 向, 中小, 微, 企业, 、, 个体, 工商户, 、, ...\n",
       "2  [公司, 立足于, 商业地产, 服务, ，, 致力于, 为, 商业地产, 开发, 、, 销售...\n",
       "3  [公司, 经, 工商管理, 部门, 核准, 的, 经营范围, 为, “, 投资, 咨询, 、...\n",
       "4  [该, 公司, 的, 主营业务, 为, 在, 中国, 境内, (, 港, 、, 澳, 、, ..."
      ]
     },
     "execution_count": 78,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# One row per description, holding its jieba token list in the `content_s` column.\n",
    "df_content = pd.DataFrame(dict(content_s=content_s))\n",
    "df_content.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 第二部分：去掉停用词"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>stopwords</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>!</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>\"</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>#</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>$</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>%</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  stopwords\n",
       "0         !\n",
       "1         \"\n",
       "2         #\n",
       "3         $\n",
       "4         %"
      ]
     },
     "execution_count": 81,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the stop-word list (GBK-encoded text file) into a single column named 'stopwords'.\n",
    "# quoting=3 is csv.QUOTE_NONE, so quote characters in the file are read as literal entries\n",
    "# rather than as field delimiters.\n",
    "stopwords=pd.read_csv('./data/stopwords2.txt',index_col=False,sep='\\t',quoting=3,names=['stopwords'],encoding='gbk')\n",
    "stopwords.head()\n",
    "# Overview of pandas.read_csv parameters: https://www.cnblogs.com/datablog/p/6127000.html"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 去掉停用词\n",
    "# 自定义一个函数\n",
    "def drop_stopwords(contents,stopwords):\n",
    "    content_clean=[]\n",
    "    all_words=[]\n",
    "    for line in contents:\n",
    "        line_clean=[]\n",
    "        for word in line:\n",
    "            if word  in stopwords:\n",
    "                continue\n",
    "            line_clean.append(word)\n",
    "            all_words.append(str(word))\n",
    "        content_clean.append(line_clean)\n",
    "    return content_clean,all_words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [],
   "source": [
    "contents=df_content.content_s.values.tolist()\n",
    "# `stopwords` is a DataFrame right after loading. Convert it to a list only while it is\n",
    "# still a DataFrame, so that re-running this cell does not fail on a list that has no\n",
    "# `.stopwords` attribute.\n",
    "if isinstance(stopwords, pd.DataFrame):\n",
    "    stopwords=stopwords.stopwords.values.tolist()\n",
    "content_clean,all_words=drop_stopwords(contents,stopwords)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>contents_clean</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>[合晟, 资产, 一家, 专注, 股票, 债券, 二级, 市场, 投资, 合格, 投资者, ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>[公司, 主营业务, 微, 企业, 个体, 工商户, 农户, 客户, 提供, 贷款, 服务,...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>[公司, 立足于, 商业地产, 服务, 致力于, 商业地产, 开发, 销售, 运营, 全, ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>[公司, 工商管理, 部门, 核准, 经营范围, 投资, 咨询, 经济, 信息, 咨询, 企...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>[公司, 主营业务, 中国, 境内, 港, 澳, 台, 保险代理, 销售, 依托, 产品, ...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                      contents_clean\n",
       "0  [合晟, 资产, 一家, 专注, 股票, 债券, 二级, 市场, 投资, 合格, 投资者, ...\n",
       "1  [公司, 主营业务, 微, 企业, 个体, 工商户, 农户, 客户, 提供, 贷款, 服务,...\n",
       "2  [公司, 立足于, 商业地产, 服务, 致力于, 商业地产, 开发, 销售, 运营, 全, ...\n",
       "3  [公司, 工商管理, 部门, 核准, 经营范围, 投资, 咨询, 经济, 信息, 咨询, 企...\n",
       "4  [公司, 主营业务, 中国, 境内, 港, 澳, 台, 保险代理, 销售, 依托, 产品, ..."
      ]
     },
     "execution_count": 84,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Token lists after stop-word removal, one row per description.\n",
    "df_content1 = pd.DataFrame(dict(contents_clean=content_clean))\n",
    "df_content1.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "分词任务完成"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2. 特征提取（20分）： 去掉停用词后（stopwords.txt），采用TFIDF作为每个文本的特征描述。 "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'合晟 资产 一家 专注 股票 债券 二级 市场 投资 合格 投资者 提供 专业 资产 管理 服务 企业 公司 业务范围 包括 资产 管理 投资 咨询 投资 顾问 服务 公司 管理 私募 基金 产品 包括 股票 型 债券 型 资产 管理 计划 证券 投资 基金 管理 总资产 规模 80 亿元 中国 证券 投资 基金业 协会 数据 公司 管理 私募 证券 投资 基金 顾问 管理 类 规模 较大 公司 管理 规模 处于 50 亿元 第一 梯队'"
      ]
     },
     "execution_count": 112,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Prepare the input for TfidfVectorizer: it takes one string per document,\n",
    "# so join each token list with single spaces.\n",
    "X_train=df_content1.contents_clean\n",
    "words=[]\n",
    "for line_index, tokens in enumerate(X_train):\n",
    "    try:\n",
    "        words.append(' '.join(tokens))\n",
    "    except TypeError:\n",
    "        # The row is not an iterable of strings: report its position and skip it.\n",
    "        # Only TypeError is caught so unrelated failures are not silently hidden.\n",
    "        print(line_index)\n",
    "words[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import the vectorizer and build the TF-IDF features.\n",
    "# max_features=20 caps the vocabulary at 20 terms; lowercase=False leaves token case untouched.\n",
    "# .toarray() converts the result of fit_transform into a dense NumPy array.\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "Vectorizer=TfidfVectorizer(analyzer='word',max_features=20,lowercase=False)\n",
    "X=Vectorizer.fit_transform(words).toarray()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(4774, 20)"
      ]
     },
     "execution_count": 114,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Shape of the dense TF-IDF array: (documents, vocabulary terms).\n",
    "X.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>专业</th>\n",
       "      <th>业务</th>\n",
       "      <th>主营业务</th>\n",
       "      <th>产品</th>\n",
       "      <th>企业</th>\n",
       "      <th>公司</th>\n",
       "      <th>包括</th>\n",
       "      <th>客户</th>\n",
       "      <th>技术</th>\n",
       "      <th>提供</th>\n",
       "      <th>收入</th>\n",
       "      <th>服务</th>\n",
       "      <th>生产</th>\n",
       "      <th>研发</th>\n",
       "      <th>系统</th>\n",
       "      <th>经营</th>\n",
       "      <th>行业</th>\n",
       "      <th>设备</th>\n",
       "      <th>销售</th>\n",
       "      <th>领域</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.291244</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.195930</td>\n",
       "      <td>0.222559</td>\n",
       "      <td>0.517844</td>\n",
       "      <td>0.582875</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.224889</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.410823</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.644582</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.353977</td>\n",
       "      <td>0.205906</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.426784</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.357683</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.326704</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.194661</td>\n",
       "      <td>0.510485</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.173058</td>\n",
       "      <td>0.194790</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.150311</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.686462</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.373893</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.433988</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.238328</td>\n",
       "      <td>0.554535</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.240823</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.439930</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.341682</td>\n",
       "      <td>0.281185</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.214825</td>\n",
       "      <td>0.207716</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.274497</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.238416</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.653301</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.593053</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         专业        业务      主营业务        产品        企业        公司        包括  \\\n",
       "0  0.291244  0.000000  0.000000  0.195930  0.222559  0.517844  0.582875   \n",
       "1  0.000000  0.000000  0.644582  0.000000  0.353977  0.205906  0.000000   \n",
       "2  0.194661  0.510485  0.000000  0.000000  0.000000  0.173058  0.194790   \n",
       "3  0.000000  0.000000  0.433988  0.000000  0.238328  0.554535  0.000000   \n",
       "4  0.000000  0.000000  0.214825  0.207716  0.000000  0.274497  0.000000   \n",
       "\n",
       "         客户   技术        提供   收入        服务   生产   研发   系统        经营        行业  \\\n",
       "0  0.000000  0.0  0.224889  0.0  0.410823  0.0  0.0  0.0  0.000000  0.000000   \n",
       "1  0.426784  0.0  0.357683  0.0  0.326704  0.0  0.0  0.0  0.000000  0.000000   \n",
       "2  0.000000  0.0  0.150311  0.0  0.686462  0.0  0.0  0.0  0.000000  0.000000   \n",
       "3  0.000000  0.0  0.240823  0.0  0.439930  0.0  0.0  0.0  0.341682  0.281185   \n",
       "4  0.000000  0.0  0.238416  0.0  0.653301  0.0  0.0  0.0  0.000000  0.000000   \n",
       "\n",
       "    设备        销售   领域  \n",
       "0  0.0  0.000000  0.0  \n",
       "1  0.0  0.000000  0.0  \n",
       "2  0.0  0.373893  0.0  \n",
       "3  0.0  0.000000  0.0  \n",
       "4  0.0  0.593053  0.0  "
      ]
     },
     "execution_count": 115,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Label the TF-IDF columns with their vocabulary terms.\n",
    "# get_feature_names() was removed in scikit-learn 1.2 in favour of get_feature_names_out();\n",
    "# prefer the new method and fall back to the old one on older installations.\n",
    "if hasattr(Vectorizer, 'get_feature_names_out'):\n",
    "    tfidf_feature_names = Vectorizer.get_feature_names_out()\n",
    "else:\n",
    "    tfidf_feature_names = Vectorizer.get_feature_names()\n",
    "new_train=pd.DataFrame(columns=tfidf_feature_names,data=X)\n",
    "new_train.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "从上面的结果可以看出，得到tfidf的结果是一个非常稀疏的矩阵。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3. 采用KMeans聚类算法，根据第2 步得到特征对企业进行聚类， 尝试K=5，10，15，20，30，..., 50, 并选择合适的度量指标，选择最佳的K。（60分） "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 导入相应工具包\n",
    "from sklearn.preprocessing import normalize\n",
    "from sklearn.cluster import MiniBatchKMeans\n",
    "from sklearn import metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Features used for clustering. This binds a second name to the same DataFrame; it is not a copy.\n",
    "x_train=new_train\n",
    "# Labels are not used by the clustering below:\n",
    "# y_train=data.category"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0.29124426, 0.        , 0.        , ..., 0.        , 0.        ,\n",
       "        0.        ],\n",
       "       [0.        , 0.        , 0.64458154, ..., 0.        , 0.        ,\n",
       "        0.        ],\n",
       "       [0.19466135, 0.51048482, 0.        , ..., 0.        , 0.37389322,\n",
       "        0.        ],\n",
       "       ...,\n",
       "       [0.44410459, 0.        , 0.        , ..., 0.        , 0.14216801,\n",
       "        0.        ],\n",
       "       [0.        , 0.14293407, 0.        , ..., 0.        , 0.        ,\n",
       "        0.16477106],\n",
       "       [0.21820823, 0.19074492, 0.15182084, ..., 0.        , 0.        ,\n",
       "        0.21988628]])"
      ]
     },
     "execution_count": 118,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Distances between samples are computed during clustering,\n",
    "# so scale every sample (row) to unit L2 norm.\n",
    "# NOTE(review): the returned array is only displayed, never assigned. `x_train` changes only if\n",
    "# copy=False really normalises a DataFrame in place - confirm.\n",
    "# NOTE(review): TfidfVectorizer was built without overriding `norm`, so the rows are presumably\n",
    "# already L2-normalised and this step may be a no-op - confirm.\n",
    "normalize(x_train, norm=\"l2\", copy=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "metadata": {},
   "outputs": [],
   "source": [
    "# KMeans clustering: fit and score the model for one candidate number of clusters K.\n",
    "def K_cluster_analysis(K, X, random_state=None):\n",
    "    \"\"\"Cluster X into K groups with MiniBatchKMeans and return the Calinski-Harabasz score.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    K : int\n",
    "        Number of clusters.\n",
    "    X : array-like\n",
    "        Feature matrix handed to ``fit_predict`` and to the scoring function.\n",
    "    random_state : int or None, optional\n",
    "        Seed forwarded to MiniBatchKMeans. The default (None) leaves the run\n",
    "        unseeded; pass an int for repeatable results.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    float\n",
    "        Calinski-Harabasz index of the partition; larger means better clustering.\n",
    "    \"\"\"\n",
    "    print(\"K-means begin with clusters: {}\".format(K))\n",
    "\n",
    "    # Fit on the training features and get a cluster id per sample in one step.\n",
    "    mb_kmeans = MiniBatchKMeans(n_clusters=K, random_state=random_state)\n",
    "    y_pred = mb_kmeans.fit_predict(X)\n",
    "\n",
    "    # Evaluate K with a label-free index. The training data does have labels, so a\n",
    "    # reference-based metric such as metrics.v_measure_score would also be possible.\n",
    "    # scikit-learn renamed calinski_harabaz_score to calinski_harabasz_score and removed\n",
    "    # the old spelling in 0.23, so resolve whichever name this installation provides.\n",
    "    ch_score_fn = getattr(metrics, \"calinski_harabasz_score\", None)\n",
    "    if ch_score_fn is None:\n",
    "        ch_score_fn = metrics.calinski_harabaz_score\n",
    "    CH_score = ch_score_fn(X, y_pred)\n",
    "\n",
    "    # The silhouette coefficient (metrics.silhouette_score) is another label-free option,\n",
    "    # but it is too slow on large samples, so it is not computed here.\n",
    "    print(\"CH_score: {}\".format(CH_score))\n",
    "\n",
    "    return CH_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 120,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "K-means begin with clusters: 5\n",
      "CH_score: 394.29553680531734\n",
      "K-means begin with clusters: 10\n",
      "CH_score: 338.8450155030921\n",
      "K-means begin with clusters: 15\n",
      "CH_score: 272.99182973300515\n",
      "K-means begin with clusters: 20\n",
      "CH_score: 235.9169895494328\n",
      "K-means begin with clusters: 25\n",
      "CH_score: 199.83448564284038\n",
      "K-means begin with clusters: 30\n",
      "CH_score: 174.7920340079772\n",
      "K-means begin with clusters: 35\n",
      "CH_score: 158.0368844694955\n",
      "K-means begin with clusters: 40\n",
      "CH_score: 148.13423091472518\n",
      "K-means begin with clusters: 45\n",
      "CH_score: 133.07165519361322\n",
      "K-means begin with clusters: 50\n",
      "CH_score: 123.44759215784723\n"
     ]
    }
   ],
   "source": [
    "# Search range for the hyper-parameter K (number of clusters): 5, 10, ..., 50.\n",
    "Ks = list(range(5, 55, 5))\n",
    "# One Calinski-Harabasz score per candidate K, in the same order as Ks.\n",
    "CH_scores = []\n",
    "for K in Ks:\n",
    "    ch = K_cluster_analysis(K, x_train)\n",
    "    CH_scores += [ch]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "5\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXkAAAEBCAYAAACdctWRAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAAIABJREFUeJzt3Xt0nXWd7/H3sy/J3rnfdi7NrWmT/Jqm15QWylXloswMoiOCgAizRh3P8TIeZZyj6GHpDJ7x6MFZujwja4SlR0RguDgKyEEKCEoL2Bs0bX9NaRJ6b25tmjRJczt/7N00QUp30iRP9rM/r7Wy0jx5kuebX3c++8n3+e3n54yOjiIiIt7kc7sAERGZOQp5EREPU8iLiHiYQl5ExMMU8iIiHqaQFxHxMIW8iIiHKeRFRDxMIS8i4mEKeRERD1PIi4h4WMCFY6YCq4GDwLALxxcRSUR+oAR4DRiI94vcCPnVwEsuHFdExAsuAf4Q785uhPxBgK6uXkZGEvcOmPn5GXR09Lhdxpyh8ThNYzGRxmOiqY6Hz+eQm5sOsQyNlxshPwwwMjKa0CEPJHz9003jcZrGYiKNx0TnOB6TanPrwquIiIcp5EVEPEwhLyLiYQp5EREPU8iLiHiYQl5ExMMSKuS37engG/e+wsBJvVBWRCQeCRXyfp/D/rZetjV3uF2KiEhCSKiQr63IIT0UYNOuNrdLERFJCAkV8n6fjxXVBWzd3cHQ8Ijb5YiIzHkJFfIADbURTgwMYd866nYpIiJzXsKFfH1VHilBn1o2IiJxiOsGZcaYa4A7gXTgGWvt3xtjrgDuBsLAQ9bar89cmaelBP0srcpnU1MbN19Vi89xZuOwIiIJ6axn8saYBcCPgQ8By4AGY8zVwH3AtUAdsDq2bVY01EY41nOS5gPds3VIEZGEFE+75sNEz9T3WWsHgRuAE0CTtbbZWjsE3A98dAbrnGBZdT5+n6OWjYjIWcQT8tWA3xjza2PMFuC/AvOYeOP6g0DZDNT3jtJDQRZV5LBpVxujo7pPtYjImcTTkw8AlwLvAXqAXwN9wPh0dYBJzWnMz8+YzO5/5tJV5fzbo6/TNwKVxZnn9L2mKhJx57hzlcbjNI3FRBqPiWZzPOIJ+UPAs9baNgBjzONEWzPj7y1QDByYzIE7OnrOaXWU6liwr9vQwjUXVU35+0xVJJJJW9vxWT/uXKXxOE1jMZHGY6KpjofP50zp5Dieds0TwPuNMTnGGD9wNfAIYIwx1bFtNwG/nfTRz0FuZioL52WxaVf7bB5WRCShnDXkrbWvAP+L6Org24FW4N+A24BHY9t2Eg3+WdVQG6H18HHaj/XN9qFFRBJCXPPkrbX3EZ0yOd46YPm0VzQJDbUR/uOFN9m8q50rV5e7WYqIyJyUcK94Ha8oL43SgnRNpRQROYOEDnmAlbURdu07SveJk26XIiIy5yR8yK+qjTA6ClubdAFWROTtEj7kK4oyyM8KqWUjIvIOEj7kHcehoTZCY0sXfQNDbpcjIjKnJHzIAzTUFjA0PMK25k63SxERmVM8EfI1ZTlkpgXVshEReRtPhLzP57CiuoDX32zXsoAiIuN4IuQh+sKovoFhdrR2uV2KiMic4ZmQXzw/l9QUPxutWjYiIqd4JuSDAT/LFuSzpantnO5uKSLiJZ4JeYi2bLpPDLJ7/zG3SxERmRM8FfLLFuYT8GtZQBGRUzwV8uHUAHWVeVoWUEQkxlMhD9EXRrUf62fvkR63SxERcZ3nQn5FTQQH1LIREcGDIZ+dnkJ1WbaWBRQRwYMhD9FZNvvaejhyVMsCikhy82zIA2zSC6NEJMl5MuQjOWHKCzPY1KSQF5Hk5smQh+jZ/Jv7jnGsV8sCikjy8nTIjwKbdTYvIknMsyFfFkknkqNlAUUkuXk25E8tC7ijpYsT/VoWUESSk2dDHqItm+GRUV7foznzIpKcPB3y
C0uzyUpP0QujRCRpeTrkfY7DypoC3tjTweDQsNvliIjMOk+HPERbNgMnh2ls0bKAIpJ8PB/ydZW5hFP9mmUjIknJ8yEf8PtYtrCALU3tDI+MuF2OiMis8nzIQ7Rl09M3yO59WhZQRJJLUoT80gV5BPw+NqplIyJJJilCPpQSoH5+Lpu1LKCIJJmkCHmItmw6ugd467CWBRSR5BGIZydjzPNAITAY2/R3wELg60AQ+Fdr7Y9mpMJpsrymAOdp2LirjcriTLfLERGZFWc9kzfGOEAtsNxau8JauwLYB9wFXAysAD5tjFk8o5Weo6y0FGrLctisvryIJJF42jUm9v4ZY8xWY8zngCuA56y1ndbaXuAR4LqZKnK6NNRG2N/ey+HOE26XIiIyK+Jp1+QC64DPE23NvAA8BBwct89BYM1kDpyfnzGZ3afFFRfM55frmrD7u1liis75+0UiavuMp/E4TWMxkcZjotkcj7OGvLV2PbD+1MfGmHuBu4F/HrebA0zqlUYdHT2MjMzuTBcHqCzK5KXN+7h0afE5fa9IJJO2tuPTU5gHaDxO01hMpPGYaKrj4fM5Uzo5jqcnf7Ex5vJxmxygBSgZt60YODDpo7ugobaANw9003V8wO1SRERmXDw9+Rzgu8aYkDEmE7gV+DhwuTEmYoxJAz4CPD2DdU6bhtoIoGUBRSQ5nDXkrbVPAE8Cm4GNwH3W2j8CdwDPA1uAB6y1r85kodNlXkE6Rblh3bBMRJJCXPPkrbXfAL7xtm0PAA/MRFEz6dSygM+8tpfe/kHSQ0G3SxIRmTFJ84rX8U4tC7h1t1aMEhFvS8qQr5qXRXaGlgUUEe9LypD3xVo22/Z0MDCoZQFFxLuSMuQh2rI5OTRCY3On26WIiMyYpA15U55DeiigWTYi4mlJG/KnlgXcurudoWEtCygi3pS0IQ/Rlk1v/xC79h51uxQRkRmR1CG/ZEEeKQGfWjYi4llJHfKpQT/1VXlsbmpnRMsCiogHJXXIQ7Rl03V8gJaDukueiHhP0of88uoCfI6jlo2IeFLSh3xGOIipyFHIi4gnJX3IQ7Rlc6jzBAfae90uRURkWinkOX2PeZ3Ni4jXKOSB3MxUqkqyFPIi4jkK+ZiG2gJaDh2ns7vf7VJERKaNQj5GLRsR8SKFfExJfjol+WkKeRHxFIX8OA21EXbtPUZP36DbpYiITAuF/DgNtRFGRkfZ0qQVo0TEGxTy48wvziQvK1UtGxHxDIX8OI7jsLImQmNLJwMntSygiCQ+hfzbNNRGGBwa4Y09HW6XIiJyzhTyb1Nbnk1GOMimJrVsRCTxKeTfxu/zsbw6n627O7QsoIgkPIX8O2iojdA3MMTOt7rcLkVE5Jwo5N9B/fw8UoN+Nlm1bEQksSnk30FK0M+SBVoWUEQSn0L+DBpqIxzrPcme/d1ulyIiMmUK+TNYvjAfv0/LAopIYlPIn0FaKMiiylw27WpjVC0bEUlQCvl30VAb4cjRPva3aVlAEUlMCvl3sbKmAAfdY15EElcg3h2NMd8DCqy1txljVgA/AbKAF4HPWGuHZqhG1+RkpLKgNLos4AcvrnK7HBGRSYvrTN4Yczlw67hN9wOfs9bWAg7wqRmobU5oqI3w1pEe2o72uV2KiMiknTXkjTF5wF3At2MfVwJha+2G2C4/BT46UwW67dSygJvVshGRBBTPmfw9wB3Aqdf4zwMOjvv8QaBsmuuaM4py0yiNpKsvLyIJ6V178saYTwJ7rbXrjDG3xTb7gPFzCh1g0nfyys/PmOyXuObiFaU8/OwugqEUcjJTx7ZHIpkuVjX3aDxO01hMpPGYaDbH42wXXm8ASowxW4A8IINowJeM26cYODDZA3d09DAykhjzzxeVZjM6CuteaeHS5fOA6H9SW9txlyubOzQep2ksJtJ4TDTV8fD5nCmdHL9ru8Zae6W1dom1dgXwP4BfW2v/Bug3xlwU2+0W4LeTPnICqSjKID8rpJaNiCScqc6T
vxn4vjFmJ9Gz+x9MX0lzj+M4NNRG2N7SSd+A52aKioiHxT1P3lr7U6IzabDWbgXWzExJc1NDbQG/+9Ne3tjTwZq6IrfLERGJi17xGqeashwy04Jq2YhIQlHIx8nnc1hZU8Drb3YwOKRlAUUkMSjkJ6GhNkL/yWF2tHa6XYqISFwU8pNQV5lHKMWvlo2IJAyF/CQEAz6WLcxnc1M7wwkyx19EkptCfpIaaiMcPzHIzha1bERk7lPIT9LSBfkE/A4vbNrHsd6TDA4Nu12SiMgZxT1PXqLCqQHq5+fx9PoWnl7fAkDA7xBODYy9pY392/+2j8+8PSXgw3EcN380EfEghfwU3Hr1IvZ19nG4rYe+gaGxtxMDQ/QNDNM3MER37wlODAzRfzK67Wz8PufMTwwpAcIh/zs8iQQoj2SQmuKfhZ9aRBKRQn4KcjJSqakqiPsmQyOjo/THwv/0k8G4t5PDE7f3R9+3He0ft88Q77SeeCQnxJeuX0FRXto0/5Qi4gUK+VngcxzSQgHSQlMf7tHRUQYGh+kbGB57Mujs7uf+Z3Zx18838oXrllFdmj2NVYuIF+jCa4JwHIdQSoDczFRKC9KpLs1mTV0Rd3xiFWmhAN/95WY2Ws3fF5GJFPIJrig3ja/dsoqKwgz+z+Nv8Oyf9rpdkojMIQp5D8hKS+H2G1eyoqaAB55t4sF1TYy8UwNfRJKOQt4jUoN+PvvhpVzeUMYzr+3lx//ZqDn8IqILr17i8zncdGUN+dkhHn5+N8d6Bvj8R5aREQ66XZqIuERn8h7jOA4fOL+Cz1xbT/PBbr798420He1zuywRcYlC3qPW1BXx5RtW0N17krt+vpHmg91ulyQiLlDIe5ipyOVrt6wi6PfxnQc28fqb7W6XJCKzTCHvcfMK0rnjE6soyUvnB4+8we+37He7JBGZRQr5JJCTkco/3ryS+qo8fva05bEX32RUUyxFkoJCPkmEUgJ84bqlXLq8hCdebuUnT+xgaFhr1Yp4naZQJhG/z8etH1hEflaIx19q5mjPAJ/98NJzuqeOiMxtOpNPMo7jcM1FVfztX9axa+9R/uUXG+ns7ne7LBGZIQr5JHXR0hK+eP1y2o/1c9fPN7L3SI/bJYnIDFDIJ7H6+Xl89eOrAPiXX2xku9atFfEchXySKy/M4I5bVpGXFeL7D2/lj28cdLskEZlGCnkhLyvEV29eRW15Dvc+uYPf/LFZUyxFPEIhLwCkhQL8t+uXs7a+iMdfauZnT+9keERTLEUSnebOyZiA38cn/2ox+dkhnni5la7jJ/kvH6onlKKHiUii0pm8TOA4Dn996UI+8QFDY3Mn3/nFZo71DLhdlohMkUJe3tF7VpTyheuWcrCzl3/+vxs50N7rdkkiMgUKeTmjZQsL+MebGhgcGuZ/3r+RXXuPul2SiEySQl7eVVVJFnd84jwy01L43oObeXXHYbdLEpFJiOuKmjHmW8B1wChwr7X2bmPMFcDdQBh4yFr79ZkrU9wUyQnztVtW8cNHX+fH/9lIZ/cA719TjuM4bpcmImdx1jN5Y8xlwPuAZcB5wOeNMcuB+4BrgTpgtTHm6pksVNyVEQ5y+8dWcN6iQh5+fjcPPNvEyIjm0ovMdWcNeWvt74H3WmuHgEKiZ/85QJO1tjm2/X7gozNaqbguGPDzmWvruWp1Oes27uNHj7/BwOCw22WJyLuIqydvrR00xnwT2A6sA+YB41//fhAom/7yZK7xOQ4fu7yGG6+oYUtTO9/75Wa6T5x0uywROQNnMi9fN8akAb8BXgSqrbW3xLZfCXzZWvuBOL7NfKB58qXKXPPy6wf437/YSH5OmK/eupqqedlulySSDKqAlnh3PuuFV2PMIiBkrd1irT1hjHmM6EXY8X+nFwMHJlNlR0dPQvd0I5FM2tqOu12Gq2pKMrn9xpX84JHX+fu7X+CiJSV86JIq8rJCbpfmKj02JtJ4TDTV8fD5HPLzMyb9dfHMrlkAfNMYczHR
2TXXAvcA3zXGVBM9K7+J6IVYSTLVpdl8+9MX8NyWAzzxhz28suMwV55Xzl9cUKkVp0TmgHguvD4FPAlsBjYCL1trHwRuAx4l2qffCTwyc2XKXJYRDvK3H1zCtz91AatMhKc2tPLf71nP717by+CQbnIm4qZJ9eSnyXygWe0abxk/Hq2HjvPw87vZ0dpFQXaIj1y2kNV1hfiSZF69HhsTaTwmmoZ2zaR68nrFq0y7yuJMbv/YCr50/XLCqQHu+XUj//SzP7Gjtcvt0kSSjpqmMiMcx2HJgnwWz89jfeMhHn9pD9/95WaWLcznussWUlY4+QtIIjJ5CnmZUT6fw0VLS1hTV8izG/fxxMut3Hnfq1y0VDNxRGaDQl5mRTDg5+rzK7lk2TyeXN/Cuo37NBNHZBboN0tmVUY4yA3vq+HyhjIef2kPT21o5cWtB7jmwvm8Z2UpwYAuE4lMJ/1GiSsKcsJ86pp67rxtNeWFGfxyXRN3/PsGXtl+mBEtIi4ybRTy4irNxBGZWWrXiOs0E0dk5ijkZc54+0ycJzUTR+ScKeRlztFMHJHpo98WmbM0E0fk3Om3ROa88TNxKoo0E0dkMhTykjCiM3FW8qUbNBNHJF5q10jCWVIVnYmzofEQj72omTgi70YhLwnJ5zhcuKSE1YsKWbdxP0+83MKd973Kospc1tYXs8pECKfq4S2i3wJJaMGAnw+cX8HFy0pYt3Ef67cd4r6ndnD/M5YVNQVcuKSYxfPzCPjVmZTkpJAXT8gIB7n24io+eNF83jzQzfrGQ7y6/TCv7jhCZlqQNXVFXLikmPnFmThJsniJCCjkxWMcx6G6NJvq0mxuvLyGN/Z0sL7xML/fcoB1G/dRlJfG2voi1tYXE8kJu12uyIxTyItnBfw+VtZEWFkT4UT/IH+ybWxoPMSvXmrmVy81U12Wzdr6YlYvKiQjHHS7XJEZoZCXpJAWCnLp8nlcunweHcf62bD9EOsbD/Pz/2d54He7WLYwnwuXFLNsYYFeZCWeopCXpJOfHeIv187nLy6o5K3DPaxvPMQr2w+zuamdtNQA5y0qZG19ETXlOUmz+Lh4l0JekpbjOFQWZ1JZnMn1761me2sn67cd5pXth3lx6wHys0JcUB+9YFuSn+52uSJTopAXIXoHzCVV+Sypymfg5DCbmtpY33iIpza08uT6ViqLM1lbX8z5i4vITk9xu1yRuCnkRd4mNcXP2vpi1tYXc6xngFd2HGF94yEeXNfEw8/tZnFV9AVXDTURUlP8bpcr8q4U8iLvIjsjlatWl3PV6nIOtPeyvvEQGxoP8e+/2U5q0E9DbYS1S4pYXJmHz6f+vcw9CnmROM0rSOcjly3kw5cuoGnvUdY3Hua1ndGz/OyMFM6vi86/LyjQ/XNk7lDIi0ySz3EwFbmYilxuvrKGrbs7WN94iHUb9/HMa3uZV5BOcV4akZwQhTlhIjlhIrlh8rNCur2CzDqFvMg5CAb8nLeokPMWFdLTN8hrO49g9x5j7+FuXn+zg6HhkbF9HQfys0LR0M8JR58EcqNPBpGcMOkhvSBLpp9CXmSaZISDvHdlKddftYi2tuOMjI5yrOckbUf7ONLVR9vR02+bm9o4fmJwwtenhwIU5ITHzv4Lc8NEskNEcsPkZYbU85cpUciLzBCf45CbmUpuZiq15Tl/9vm+gaFY6PePhf+Ro320Hj7Opl1tDI+cXvXK73MoyA6NtX4i2bEngdhfBKEU/SrLO9MjQ8Ql4dQAFUWZVBRl/tnnhkdG6Ooe4Mi48G872k9bVx9vHuimb2Bowv5ZacFo+I/7S6AoN415Bela+DzJ6X9fZA7y+3wU5IQpOMOdMnv6Bie0f061g5r2HuWVxsOMX/k2PyuV0kgGpZF0ygqi70vy03WPniShkBdJQBnhIBnhIFUlWX/2uaHhEdqP9XOo4wT723vY39bLvrYeGps7x1pAPsehKC9M2anwj72P5IR1vx6PUciLeEzA76M4L43i
vDRW1BSMbR8aHuFw5wn2t0dDf39bLy2Hunlt55GxfVKCPublp08I/7JIOlnpKVpsJUHFFfLGmDuB62MfPmmt/Yox5grgbiAMPGSt/foM1Sgi0yDg98XaNhmsqSsa295/cogD7SfGgn9fWw+v7+ngD28cHNsnIxykLJJOaUEGpYWn2z5aR3fuO+v/UCzMrwJWAqPA08aYG4HvAJcBe4EnjTFXW2t/O5PFisj0C6UEWDAviwXzJrZ+uk+cHAv9/W297G/r4Q/bDjJwcnhsn/ys0IR2T1kkg+K8NPX755B4noYPAl+21p4EMMbsAGqBJmttc2zb/cBHAYW8iEdkpaWQVZlCXWXu2LaR0VE6j/Wzrz0a+vti4T++3+/3ORTlpVFakE5ZJJ3li4oozEzRNE+XOKOjo2ffK8YYUwP8EfghYKy1H49tvwL4irX2qji+zXygefKlishcNTQ8wv62HloPdtN66HjsfTeHOk4A0eCvrchlWU0By2siLKrMJRjQHTynqApoiXfnuJ9ajTH1wJPAPwBDRM/mT3GAkXf6ujPp6OhhZCT+J5i5JhLJpK3tuNtlzBkaj9OSdSzS/A51ZdnUlWWPbes/OURH7yAbXj/AjtYuHn52Fw/9bhfBgI+asmzqKnOpq8yjsjgDvy85WjxTfXz4fA75+ZO/+V28F14vAh4FvmitfdAYcxlQMm6XYuDApI8uIp4WSgmwojSX0tzofP8T/UPs2nuUHa1d7Gjt5NHf7wH2EE71Y8pzY6Gfy7xIuqZyTpN4LryWA78CbrDWPhfb/Er0U6aaaOvlJuC+GatSRDwhLRRgRU3B2NTO7t6T7HyrKxb6XWzZ3Q5AZlqQRRW51M2Phn5hTlhTOKconjP524EQcLcx5tS2HwO3ET27DwFPAY/MQH0i4mFZ6SmsqSsam9LZcayfnW91sb0leqZ/ag5/XlYqdWOhn0duZqqbZSeUSV14nSbzgWb15L1F43GaxmKiqY7H6Ogoh7v6omf5LZ3sfOsoPX3RO3cW5aVRV5nL4spcTEUOmWmJs+7uNPTkZ+bCq4jIbHIcZ+yVu+9dWcrI6Cj7jvSMtXbWNx7ihc37ASgvzBjr59eW5+hFWuNoJEQkIfgcZ+yune9fU8HQ8Aith46zvbWLna1dPLdpP8+8thef41BVkhlt7VTksrA0m5Rg8k7XVMiLSEIK+H0sLM1mYWk211w4n8GhYXbv72ZHayc7Wrt4av1bPPFyKwG/j+rSLBbPz2PJgjwqijKTauaOQl5EPCEY8I+1bCC6KMvp6ZpdPPbiHh57cQ8Z4SCL5+eypCqf+irvX8RVyIuIJ4VTAyyvLmB5dXS65rHek2xv6WTbnk4aWzp5dUd05k5pQTr1VXksqcqjtjzHc60dhbyIJIXs9BTW1heztr6Y0dFR9rX1sq25g8bmzrF+fsDvw5RnUx87yy+LpCf8/HyFvIgkHcdxKC/MoLwwg6vPr2RgcJhde4/S2NzJtuZOHn5+NzwffWI4dZa/eH4eWemJM1XzFIW8iCS91KCfpQvyWbogH4DO7n4aWzppbO5k6+52Xt52CICKooxY6OdTXZqdELdUVsiLiLxNXlaIS5bN45Jl8xgZGaX18HG2NUdD/5lX9/LbDW+REvSxqCJ37Ey/OC9tTrZ2FPIiIu/C53OoKsmiqiSLay6cT9/AEDvf6hpr7bz+ZgcQXTC9viqP+qp86ipzyQgHXa48SiEvIjIJ4dQAK2sirKyJAHDkaB/bY4H/2s4jvLj1II4DVSVZ1Mfm5leVZBHwu9PaUciLiJyDwpwwhStLec/KUoZHRmg+cHxs1s4T61v4zcsthFP9LKrIZemCfP768tqzfs/ppJAXEZkmfp+P6rJsqsuy+dAlC+jtH2RHS1esn9/B5qZ2FlbmUZ4XnrWaFPIiIjMkPRTkvEWFnLeokNHRUXr7h6iqyJvVu5TO/fk/IiIe4DiOKxdjFfIiIh6mkBcR8TCFvIiIhynkRUQ8TCEvIuJh
CnkREQ9zY568H6L3g0h0XvgZppPG4zSNxUQaj4mmMh7jvmZSq5o4o6Ojkz7YOboYeGm2Dyoi4hGXAH+Id2c3Qj4VWA0cBIZn++AiIgnKD5QArwED8X6RGyEvIiKzRBdeRUQ8TCEvIuJhCnkREQ9TyIuIeJhCXkTEwxTyIiIeppAXEfEwLf8XJ2NMFvAy8FfW2hZjzBXA3UAYeMha+3VXC5xFxpg7getjHz5prf1Kko/Ht4DrgFHgXmvt3ck8HgDGmO8BBdba24wxK4CfAFnAi8BnrLVDrhY4S4wxzwOFwGBs098BC4GvA0HgX621P5rJGnQmHwdjzPlEX0ZcG/s4DNwHXAvUAauNMVe7V+HsiYXXVcBKYAWwyhhzI8k7HpcB7wOWAecBnzfGLCdJxwPAGHM5cOu4TfcDn7PW1gIO8ClXCptlxhiHaGYst9ausNauAPYBdxG9vcsK4NPGmMUzWYdCPj6fAj4LHIh9vAZostY2x85I7gc+6lZxs+wg8GVr7Ulr7SCwg+gDOSnHw1r7e+C9sZ+7kOhfxzkk6XgYY/KIhti3Yx9XAmFr7YbYLj8lScYCMLH3zxhjthpjPgdcATxnre201vYCjxD9K3DGqF0TB2vtJwGMOfV/xjyiYXfKQaBslstyhbW28dS/jTE1RNs2PyRJxwPAWjtojPkmcDvwHyTx4wO4B7gDKI99nMxjkQusAz5PtDXzAvAQfz4ea2ayCJ3JT42PaP/1FAcYcakWVxhj6oHfAf8A7CHJx8NaeycQIRputSTheBhjPgnstdauG7c5aX9XrLXrrbWfsNYes9a2A/cC32KWx0Nn8lOzj+jd4E4p5nQrx/OMMRcBjwJftNY+GOtLJ+V4GGMWASFr7RZr7QljzGNE//wef4fVZBmPG4ASY8wWIA/IIBpoyfrYuBhIHfek5wAtzPJ4KOSn5hXAGGOqgWbgJqIX2jzPGFMO/Aq4wVr7XGxz0o4HsAD4ZuwXepToxdZ7gO8m23hYa6889W9jzG3Ae6y1f2OM2WaMucha+0fgFuC3btU4y3KAbxljLiTarrkV+DhwvzEmAvQCHwE+PZNFqF0zBdbafuA2omez24GdRC+gJIPbgRBwtzFmS+ys7TaSdDystU8BTwKbgY3wGgEvAAAAaElEQVTAy9baB0nS8TiDm4HvG2N2Ej27/4HL9cwKa+0TTHxs3Bd7orsDeB7YAjxgrX11JuvQ/eRFRDxMZ/IiIh6mkBcR8TCFvIiIhynkRUQ8TCEvIuJhCnkREQ9TyIuIeJhCXkTEw/4/QC15DH5kBrgAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# 绘制不同K对应的聚类的性能，找到最佳模型／参数（分数最高）\n",
    "plt.plot(Ks, np.array(CH_scores), 'b-',label = 'CH_scores')\n",
    "\n",
    "### 最佳超参数\n",
    "index = np.unravel_index(np.argmax(CH_scores, axis=None), len(CH_scores))\n",
    "Best_K = Ks[ index[0]]\n",
    "\n",
    "print(Best_K)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "由上面的结果可知，按 CH_scores 得到的最佳聚类数为5，而数据实际的类别数为10，说明k-means聚类算法在该数据集上的效果有限。原因可能是数据样本过少，也可能与使用tfidf提取特征有关。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
